We're using standard Python data tools and the imbalanced-learn contrib toolkit. The notebook includes analysis and techniques drawn from multiple sources and, where possible, each is labeled with a link to the underlying paper, code, or technique.
# Standard library
from collections import Counter

# Third-party: core scientific stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# scikit-learn (note: confusion_matrix was previously imported twice)
from sklearn.datasets import make_classification
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    precision_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.svm import LinearSVC, SVC
from sklearn.utils.multiclass import unique_labels

# imbalanced-learn: pipeline, metrics, and resampling strategies
from imblearn.base import BaseSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import (
    ADASYN,
    BorderlineSMOTE,
    RandomOverSampler,
    SMOTE,
    SVMSMOTE,
)
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import (
    AllKNN,
    EditedNearestNeighbours,
    NearMiss,
    RandomUnderSampler,
    RepeatedEditedNearestNeighbours,
    TomekLinks,
)
def create_dataset(n_samples=1000, weights=(0.02, 0.98), n_classes=2, class_sep=0.8, n_clusters=1):
    """Build a small 2-feature synthetic classification dataset.

    Thin wrapper around sklearn's ``make_classification`` with a fixed
    ``random_state`` so every cell in the notebook draws identical data.
    ``weights`` controls the class imbalance ratio.
    """
    return make_classification(
        n_samples=n_samples,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters,
        weights=list(weights),
        class_sep=class_sep,
        random_state=0,
    )
def plot_resampling(X, y, sampling, ax):
    """Resample ``(X, y)`` with ``sampling``, scatter the result on ``ax``,
    and return a ``Counter`` of the resampled class sizes."""
    res_X, res_y = sampling.fit_resample(X, y)
    ax.scatter(res_X[:, 0], res_X[:, 1], c=res_y, alpha=0.8, edgecolor='k')
    # Cosmetic cleanup: hide the top/right spines, then push the remaining
    # spines outward so they do not touch the data.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for side in ('left', 'bottom'):
        ax.spines[side].set_position(('outward', 10))
    return Counter(res_y)
def plot_decision_function(X, y, clf, ax):
    """Shade ``clf``'s predicted class regions over a mesh covering ``X``
    and overlay the data points on ``ax``."""
    step = 0.02
    # The mesh extends one unit beyond the data range on every side.
    x_lo, x_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_lo, y_hi = X[:, 1].min() - 1, X[:, 1].max() + 1
    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step),
                                 np.arange(y_lo, y_hi, step))
    preds = clf.predict(np.c_[grid_x.ravel(), grid_y.ravel()])
    preds = preds.reshape(grid_x.shape)
    ax.contourf(grid_x, grid_y, preds, alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k')
def plot_comparison(sampler, X, y):
    """Three-panel view of a sampler's effect.

    Left: LinearSVC decision function on the raw data.
    Middle: decision function after resampling with ``sampler`` (pipeline).
    Right: the resampled points themselves.
    Returns the fitted sampler + LinearSVC pipeline.
    """
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))

    baseline = make_pipeline(LinearSVC())
    baseline.fit(X, y)
    plot_decision_function(X, y, baseline, ax1)
    ax1.set_title('Linear SVC with y={}'.format(Counter(y)))

    pipe = make_pipeline(sampler, LinearSVC())
    pipe.fit(X, y)
    plot_decision_function(X, y, pipe, ax2)
    ax2.set_title('Decision function for {}'.format(type(sampler).__name__))

    plot_resampling(X, y, sampler, ax3)
    ax3.set_title('Resampling using {}'.format(type(sampler).__name__))
    fig.tight_layout()
    return pipe
https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def confusion(technique, df, yHat, y):
    """Summarize binary predictions ``yHat`` against truth ``y``.

    Prints the confusion matrix and the imbalanced classification report,
    stores per-class precision/recall in ``df`` under the row label
    ``technique``, and returns everything.

    Assumes a binary problem where class 0 is the majority and class 1 the
    minority, i.e. a 2x2 confusion matrix.
    """
    cm = confusion_matrix(y, yHat)
    report = classification_report_imbalanced(y, yHat)
    # Rows of cm are true labels, columns are predictions:
    # cm[0][0]=maj correct, cm[0][1]=maj missed, cm[1][0]=min missed, cm[1][1]=min correct.
    majPrecision = cm[0][0] / (cm[0][0] + cm[1][0])
    majRecall = cm[0][0] / (cm[0][0] + cm[0][1])
    minPrecision = cm[1][1] / (cm[1][1] + cm[0][1])
    minRecall = cm[1][1] / (cm[1][1] + cm[1][0])
    df.loc[technique] = [majPrecision, majRecall, minPrecision, minRecall]
    print(cm)
    print(report)
    return cm, report, (majPrecision, majRecall), (minPrecision, minRecall)
# Show how LinearSVC degrades as class imbalance grows: four datasets with
# the same class separation but an increasingly heavy majority class.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
for ax, weights in zip((ax1, ax2, ax3, ax4),
                       ((0.5, 0.5), (0.75, 0.25), (0.85, 0.15), (0.95, 0.05))):
    X, y = create_dataset(n_samples=1000, weights=weights, class_sep=0.4)
    pipe = make_pipeline(LinearSVC())
    pipe.fit(X, y)
    plot_decision_function(X, y, pipe, ax)
    ax.set_title('Linear SVC with y={}'.format(Counter(y)))
fig.tight_layout()

# Keep a summary of how well different techniques work
resultsDF = pd.DataFrame(columns=['Maj Precision', 'Maj Recall', 'Min Precision', 'Min Recall'])

# Baseline: the most imbalanced setting, fit with no resampling at all.
fig, ax1 = plt.subplots(1, 1, figsize=(15, 7))
X, y = create_dataset(n_samples=1000, weights=(0.95, 0.05), class_sep=0.4)
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
con, cr, majority, minority = confusion('Baseline', resultsDF, pipe.predict(X), y)
resultsDF
Achieve balance by increasing the number of minority-class samples, which reduces the imbalance ratio.
# Random over-sampling: duplicate minority points at random.
pipe = plot_comparison(RandomOverSampler(random_state=0), X, y)
con, cr, majority, minority = confusion('Random Over Sampling', resultsDF, pipe.predict(X), y)
resultsDF

# SMOTE: synthesize minority points by interpolating between neighbors.
pipe = plot_comparison(SMOTE(), X, y)
con, cr, prec, rec = confusion('SMOTE', resultsDF, pipe.predict(X), y)
resultsDF

# Borderline SMOTE, variant 1: synthesize only near the class boundary.
pipe = plot_comparison(BorderlineSMOTE(random_state=0, kind='borderline-1'), X, y)
con, cr, prec, rec = confusion('Borderline SMOTE 1', resultsDF, pipe.predict(X), y)
resultsDF

# Borderline SMOTE, variant 2.
pipe = plot_comparison(BorderlineSMOTE(random_state=0, kind='borderline-2'), X, y)
con, cr, prec, rec = confusion('Borderline SMOTE 2', resultsDF, pipe.predict(X), y)
resultsDF
SVM-SMOTE focuses on generating new minority class instances near borderlines with SVM so as to help establish boundary between classes. https://medium.com/vclab/tackling-class-imbalance-with-svm-smote-efa41ec3de5f
# SVM-SMOTE: synthesize minority samples near the SVM-estimated boundary.
pipe = plot_comparison(SVMSMOTE(random_state=0), X, y)
con, cr, prec, rec = confusion('SVMSMOTE', resultsDF, pipe.predict(X), y)
resultsDF
"SMOTE and ADASYN generate new samples in by interpolation. However, the samples used to interpolate/generate new synthetic samples differ. In fact, ADASYN focuses on generating samples next to the original samples which are wrongly classified using a k-Nearest Neighbors classifier while the basic implementation of SMOTE will not make any distinction between easy and hard samples to be classified using the nearest neighbors rule. Therefore, the decision function found during training will be different among the algorithms." https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#cbhk2002
# ADASYN: like SMOTE, but biases generation toward minority samples that a
# k-NN classifier gets wrong (the "hard" ones).
pipe = plot_comparison(ADASYN(), X, y)
con, cr, prec, rec = confusion('ADASYN', resultsDF, pipe.predict(X), y)
resultsDF
Achieve balance by reducing the number of majority-class data points, which reduces the imbalance ratio.
# Random under-sampling: discard majority points at random.
pipe = plot_comparison(RandomUnderSampler(random_state=0), X, y)
con, cr, prec, rec = confusion('Random Under Sampling', resultsDF, pipe.predict(X), y)
resultsDF

# Tomek links: remove majority points that form cross-class nearest-neighbor pairs.
pipe = plot_comparison(TomekLinks(), X, y)
con, cr, prec, rec = confusion('TomekLinks', resultsDF, pipe.predict(X), y)
resultsDF
The NearMiss algorithms implement heuristic rules for selecting which majority-class samples to keep.
# NearMiss versions 1-3: select majority samples by different
# distance-to-minority heuristics.
for version in (1, 2, 3):
    pipe = plot_comparison(NearMiss(version=version), X, y)
    con, cr, prec, rec = confusion('NearMiss-{}'.format(version), resultsDF, pipe.predict(X), y)
    resultsDF
http://cgm.cs.mcgill.ca/~godfried/teaching/projects.pr.98/sergei/project.html
# Neighbor-based cleaning under-samplers, then the SMOTE + cleaning combos.
# Each entry pairs a sampler instance with its row label in resultsDF.
for sampler, label in (
        (EditedNearestNeighbours(), 'Edited Nearest Neighbors'),
        (RepeatedEditedNearestNeighbours(), 'Repeated Edited Nearest Neighbors'),
        (AllKNN(allow_minority=True), 'AllKNN'),
        (SMOTEENN(), 'SMOTE+ENN'),
        (SMOTETomek(), 'SMOTE+TomekLinks'),
):
    pipe = plot_comparison(sampler, X, y)
    con, cr, prec, rec = confusion(label, resultsDF, pipe.predict(X), y)
    resultsDF
Some techniques such as XGBoost allow you to scale the weights of the minority class, achieving balance in the weight of all samples rather than in the count of samples.
# Top techniques by majority precision and by minority recall.
resultsDF.sort_values('Maj Precision', ascending=False).head(3)
resultsDF.sort_values('Min Recall', ascending=False).head(3)

# Recap figure: baseline LinearSVC vs a SMOTE-resampled LinearSVC, plus the
# resampled data itself.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))

pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))

pipe = make_pipeline(SMOTE(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for SMOTE')

plot_resampling(X, y, SMOTE(random_state=0), ax3)
ax3.set_title('Resampling using SMOTE')
fig.tight_layout()
"ROC curves are appropriate when the observations are balanced between each class, whereas precision-recall curves are appropriate for imbalanced datasets. In both cases the area under the curve (AUC) can be used as a summary of the model performance." http://www.davidsbatista.net/blog/2018/08/19/NLP_Metrics/
Practical Guidelines
# Generate probabilities.
# FIX: SMOTE has no `kind` argument (the borderline variants were split out
# into BorderlineSMOTE in imbalanced-learn 0.4); SMOTE(kind='borderline-1')
# raises a TypeError on current versions.
pipe = make_pipeline(BorderlineSMOTE(kind='borderline-1'), SVC(probability=True, gamma='auto'))
pipe.fit(X, y)
# Probability of the positive (minority) class for every training point.
probs = pipe.predict_proba(X)
probs = probs[:, 1]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))

# ROC curve (can look optimistic on imbalanced data).
fpr, tpr, thresholds = roc_curve(y, probs)
ax1.plot([0, 1], [0, 1], linestyle='--')  # chance diagonal
ax1.plot(fpr, tpr, marker='.')
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('ROC Curve - AUC: %.3f' % roc_auc_score(y, probs))

# Precision-recall curve.
# FIX: the no-skill baseline of a PR curve is the positive-class prevalence
# (precision of a classifier that predicts positive for everything), not 0.5.
precision, recall, thresholds = precision_recall_curve(y, probs)
no_skill = np.sum(y == 1) / len(y)
ax2.plot([0, 1], [no_skill, no_skill], linestyle='--')
ax2.plot(recall, precision, marker='.')
ax2.set_xlabel('Recall')
ax2.set_ylabel('Precision')
ax2.set_title('Precision Recall Curve - AUC: %.3f' % auc(recall, precision))